# ANES_createImputedDatasets.R

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# Coding rules:
# --Cases should be excluded from the missing-data imputation if they  
#   (a) are missing data on the dependent variable in my IV analyses, or (b) 
#   are for people who were never assigned to be asked the dependent-variable
#   question.  I code these cases as 0.  makePredictorMatrix() then omits them 
#   from the analysis.  (Note that makePredictorMatrix() omits cases that have 
#   "0" values *in the dependent variable*.  It does not pay attention to "0" 
#   values in other variables.)
#
# Some people were never assigned to be asked the questions that I use in my 
# analysis.  This mainly occurs because a question was not asked of anyone in 
# a particular year.  It also occurs for other reasons, e.g., split-ballot 
# sampling.  These cases are initially coded as 0 and are not used for 
# imputation.  



##############################################################################
# PRELIMINARIES
##############################################################################
library(Amelia)
library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))  # for Bullock::rescale()
library(car)      # for Recode()
library(dplyr)    # for bind_rows(), %>%
library(foreach)  # to use multiple cores
library(haven)    # for read_spss()
if (Sys.info()['sysname'] == 'Windows') { 
  library(doParallel)  # to use multiple cores
} 
source('ANES_coding_yearsOfSchooling.R')
source('ANES_stateYoungAugmentation.R')
source('functions/makePredictorMatrix.R')
source('functions/NES_StateRecode.R')
source('functions/tabNA.R')
dirOutput <- 'data/'


# LOAD CUMULATIVE ANES
ANES_cumulative <- tempfile(fileext = '.zip')
download.file(
  url      = 'https://electionstudies.org/wp-content/uploads/2018/12/anes_timeseries_cdf_dta.zip', 
  destfile = ANES_cumulative)
ANES_forImputation <- unz(ANES_cumulative, 'anes_timeseries_cdf_stata13.dta') %>% 
  read_dta() %>%
  rename_all(toupper) %>% 
  select(
    VCF0004,
    VCF0006A,
    VCF0101,
    VCF0104,
    VCF0106,
    VCF0112,
    VCF0116,
    VCF0132,
    VCF0133,    
    VCF0140,
    VCF0142,    
    VCF0806,
    VCF0809,
    VCF0901B,
    VCF9013,
    VCF9015,

    VCF0111,
    VCF0114,
    VCF0118,
    VCF0119,
    VCF0127,
    VCF0128,
    VCF0130,
    VCF0137,
    VCF0143,
    VCF0146,
    VCF0147,
    VCF0156,
    VCF0157,
    VCF9002,
  
    # Group thermometers
    VCF0206,
    VCF0207,
    VCF0208,
    VCF0209,
    VCF0210,
    VCF0211,
    VCF0212,
    VCF0220,
    VCF0223,
    VCF9004,
    VCF9006,
      
    # Partisanship
    VCF0301,
    VCF0201,
    VCF0202,
    VCF0218,
    VCF0224,
    VCF0324,
    VCF0411,
      
    # Trust and support of political system
    VCF0604,
    VCF0605,
    VCF0606,
    VCF0619,
    VCF0620,
    VCF0621, 
    VCF0704A,
      
    # Ideology, issue positions, and related questions
    VCF0803,
    VCF0830,
    VCF0834,
    VCF0838,
    VCF0839,
    VCF0846,
    VCF0847,
    VCF0851,
    VCF0852,
    VCF0853,
    VCF0854,
    VCF0867A,
    VCF0871,
    VCF0872,
    VCF0880,
    VCF0881) %>%
  zap_labels()



##############################################################################
# CODE ESSENTIAL VARIABLES 
##############################################################################
# Change the variable so that it is ready for missing-data imputation.
# DK = NA, INAP = 0. 

# GOVERNMENT-GUARANTEED STANDARD OF LIVING
guarantee.7pt <- ANES_forImputation$VCF0809 %>% as.integer()
guarantee.7pt[is.na(ANES_forImputation$VCF0809)] <- 0
guarantee.7pt[ANES_forImputation$VCF0809==9]     <- NA  # 9 is DK

# GOVERNMENT-PROVIDED HEALTH CARE
govt.health.7pt <- ANES_forImputation$VCF0806 %>% as.integer()
govt.health.7pt[is.na(ANES_forImputation$VCF0806)] <- 0
govt.health.7pt[ANES_forImputation$VCF0806==9]     <- NA

# YEARS OF EDUCATION ("EDUC")
ANES_forImputation$ID.unique <- ANES_forImputation$VCF0006A %>% as.integer()  # unique respondent number
ANES_forImputation$yearInt   <- ANES_forImputation$VCF0004  %>% as.integer()  # year of interview
ANES_forImputation <- left_join(ANES_forImputation, ANES_codeYearsOfSchooling(), by = "ID.unique") %>%
  mutate(
    educ      = educYearsUncensored,
    yearsTo13 = pmin(educYearsUncensored, 13))


# STATE-OF-RESIDENCE VARIABLES
age               <- Recode(ANES_forImputation$VCF0101, '0 = NA') %>% as.integer()
yearYoung         <- ANES_forImputation$yearInt - age + 14                           # year at which subject was 14
stateYoung        <- ANES_forImputation$VCF0132                                 # state where "you grew up" 
stateYoung[is.na(stateYoung)] <- ANES_forImputation$VCF0133[is.na(stateYoung)]  # again, state where "you grew up"
stateYoung        <- NES_StateRecode(stateYoung) 
state.contemp     <- Recode(ANES_forImputation$VCF0901B, 'c("HI", 99)=NA')      # state of current residence
region.contemp    <- Recode(ANES_forImputation$VCF0112, '1="Northeast"; 2="North Central"; 3="South"; 4="West"', as.factor=TRUE)
stateOfBirth      <- NES_StateRecode(ANES_forImputation$VCF0142)  
stateYoungAugmented.df <- data.frame(
  age, yearYoung, stateYoung, state.contemp, region.contemp, stateOfBirth, 
  yearInt = ANES_forImputation$yearInt, ID.unique = ANES_forImputation$ID.unique)
stateYoungAugmented.df <- ANES_stateYoungAugmentation(ANES_obj = stateYoungAugmented.df)


# OTHER VARIABLES
female     <- Recode(ANES_forImputation$VCF0104, '0 = NA') == 2
educ5Level <- Recode(
  var       = ANES_forImputation$VCF0140,
  recodes   = '1="max8"; 2="max12"; 3:4="HSdiploma"; 5="some college"; 6="BA or higher"; else=NA', 
  levels    = c('max8', 'max12', 'HSdiploma', 'some college', 'BA or higher'),
  as.factor = TRUE)
race       <- Recode(
  var       = ANES_forImputation$VCF0106, 
  recodes   = '1="white"; 2="black"; 3="otherRace"; else=NA', 
  as.factor = TRUE)                
bornInUS   <- Recode(
  var       = ANES_forImputation$VCF0142, 
  recodes   = '101:199 = TRUE; c(998, 999, 000) = NA; else = FALSE') == 1



##############################################################################
# SAVE "MISSINGNESS" DATASET FOR FIGURES A11 AND A12 
##############################################################################
# ID.unique <- ANES_forImputation$ID.unique
# yearInt   <- ANES_forImputation$yearInt
saveRDS(ANES_forImputation, file = 'data/ANES_missingness.RDS')



##############################################################################
# CREATE DATA FRAMES FOR IMPUTATION 
##############################################################################

# REQUIRED VARIABLES
reqd.vars <- data.frame(
  respondentID = ANES_forImputation$VCF0006A,  # unique respondent ID (so some IDs will appear more than once)
  yearYoung,
  state.contemp,
  region.contemp,
  stateOfBirth,
  stateYoung = stateYoungAugmented.df$stateYoung,
  govt.health.7pt,
  guarantee.7pt,
  yearInt = ANES_forImputation$yearInt,
  female,
  age,
  educ = ANES_forImputation$educ,
  educ5Level,  
  race,
  bornInUS)


cand.vars <- with(ANES_forImputation, data.frame(
  
  # Demographics  
  urbanism           = VCF0111,
  incomeFam          = VCF0114,
  workStatus         = VCF0118,
  workStatusHead     = VCF0119,
  union              = VCF0127,
  religion           = VCF0128,
  relAttendance      = VCF0130,
  typePlaceGrewUp    = VCF0137,
  parentsNatives     = VCF0143,
  homeOwned          = VCF0146,
  maritalStatus      = VCF0147,
  laidOffRecently    = VCF0156,
  hoursCutRecently   = VCF0157,
  yearsInCommunity   = VCF9002,
  
  # Group thermometers
  therm.blacks        = VCF0206,
  therm.whites        = VCF0207,
  therm.Southerners   = VCF0208,
  therm.business      = VCF0209,
  therm.unions        = VCF0210,
  therm.liberals      = VCF0211,
  therm.conservatives = VCF0212,
  therm.welfareRecips = VCF0220,
  therm.poorPeople    = VCF0223,
  therm.elderly       = VCF9004,
  therm.women         = VCF9006,
  
  # Partisanship
  PID                  = VCF0301,
  therm.Dems           = VCF0201,
  therm.Reps           = VCF0202,
  therm.DemP           = VCF0218,
  therm.RepP           = VCF0224,
  likesDislikesParties = VCF0324,
  likesDislikesCands   = VCF0411,
  
  # Trust and support of political system
  trustGovToDoRight     = VCF0604,
  govRunForBenefitOfAll = VCF0605,
  howMuchGovtWaste      = VCF0606,
  peopleCanBeTrusted    = VCF0619,
  peopleAreHelpful      = VCF0620,
  peopleTryToBeFair     = VCF0621, 
  votedforRepPOTUS      = VCF0704A,
  
  # Ideology, issue positions, and related questions
  libcon                  = VCF0803,
  aidToBlacks             = VCF0830,
  womenEqualRole          = VCF0834,
  allowAbortion           = VCF0838,
  servSpend               = VCF0839,
  relImportant            = VCF0846,
  relImpDaytoDay          = VCF0847,
  newLifestylesBad        = VCF0851,
  adjustMoralViews        = VCF0852,
  moreTradValues          = VCF0853,
  diffMoralStandardsOK    = VCF0854,
  affirmativeActionBlacks = VCF0867A,
  economyBetterInPastYear = VCF0871,
  economyBetterNextYear   = VCF0872,
  betterOffInPastYear     = VCF0880,
  betterOffNextYear       = VCF0881))
  
  
  
##############################################################################
# RECODE VARIABLES AND CHANGE THEIR CLASSES
##############################################################################
cand.vars <- mutate_all(cand.vars, as.integer)  
cand.vars$urbanism                <- Recode(cand.vars$urbanism,  '0 = NA') 
cand.vars$incomeFam               <- factor(Recode(cand.vars$incomeFam, '0 = NA'))
cand.vars$workStatus              <- factor(Recode(cand.vars$workStatus, 'c(0,9)=NA'))
cand.vars$workStatusHead          <- factor(Recode(cand.vars$workStatusHead, '9 = NA'))
cand.vars$union                   <- Recode(cand.vars$union, '0=NA; 1=1; 2=0')
cand.vars$religion                <- factor(Recode(cand.vars$religion, '0=NA')) 
cand.vars$relAttendance           <- Recode(cand.vars$relAttendance, 'c(0, 8, 9)=NA; 7=6') 
cand.vars$typePlaceGrewUp         <- Recode(cand.vars$typePlaceGrewUp, '9=NA')
cand.vars$parentsNatives          <- Recode(cand.vars$parentsNatives, 'c(8,9)=NA')==1
cand.vars$homeOwned               <- Recode(cand.vars$homeOwned, 'c(8,9)=NA')==1
cand.vars$maritalStatus           <- factor(Recode(cand.vars$maritalStatus, '8:9=NA'))
cand.vars$yearsInCommunity        <- Recode(cand.vars$yearsInCommunity, 'c(9,0)=NA')
cand.vars$PID                     <- Recode(cand.vars$PID, '0 = NA')
cand.vars$likesDislikesParties    <- Recode(cand.vars$likesDislikesParties, '999=NA')
cand.vars$likesDislikesCands      <- Recode(cand.vars$likesDislikesCands, '999=NA') 
cand.vars$trustGovToDoRight       <- Recode(cand.vars$trustGovToDoRight, 'c(0,9)=NA')
cand.vars$govRunForBenefitOfAll   <- Recode(cand.vars$govRunForBenefitOfAll, 'c(0,9)=NA')==2
cand.vars$howMuchGovtWaste        <- Recode(cand.vars$howMuchGovtWaste, 'c(0,9)=NA')
cand.vars$libcon                  <- Recode(cand.vars$libcon, 'c(0,9)=NA')
cand.vars$aidToBlacks             <- Recode(cand.vars$aidToBlacks, 'c(0,9)=NA')
cand.vars$womenEqualRole          <- Recode(cand.vars$womenEqualRole, 'c(0,9)=NA')
cand.vars$allowAbortion           <- Recode(cand.vars$allowAbortion, 'c(0,9)=NA')
cand.vars$servSpend               <- Recode(cand.vars$servSpend, 'c(0,9)=NA')
cand.vars$relImportant            <- Recode(cand.vars$relImportant, 'c(0,8)=NA')==1
cand.vars$relImpDaytoDay          <- Recode(cand.vars$relImpDaytoDay, 'c(0,8)=NA; 5=0')
cand.vars$newLifestylesBad        <- Recode(cand.vars$newLifestylesBad, '8:9=NA')
cand.vars$adjustMoralViews        <- Recode(cand.vars$adjustMoralViews, '8:9=NA')
cand.vars$moreTradValues          <- Recode(cand.vars$moreTradValues, '8:9=NA')
cand.vars$diffMoralStandardsOK    <- Recode(cand.vars$diffMoralStandardsOK, '8:9=NA')
cand.vars$affirmativeActionBlacks <- Recode(cand.vars$affirmativeActionBlacks, '7:9=NA')
cand.vars$economyBetterInPastYear <- Recode(cand.vars$economyBetterInPastYear, 'c(0,8,9)=NA')
cand.vars$economyBetterNextYear   <- Recode(cand.vars$economyBetterNextYear, '3=2;5=3;8:9=NA')
cand.vars$betterOffInPastYear     <- Recode(cand.vars$betterOffInPastYear, 'c(0,9)=NA')
cand.vars$betterOffNextYear       <- Recode(cand.vars$betterOffNextYear, 'c(0,9)=NA')

therm.questions <- grepl('^therm\\.', names(cand.vars))
cand.vars[therm.questions] <- lapply(cand.vars[therm.questions], function (x) Recode(x, '98:100=NA'))

peopleTrust.questions <- grepl('^people', names(cand.vars))
cand.vars[peopleTrust.questions] <- lapply(cand.vars[peopleTrust.questions], function (x) Recode(unclass(x), '1=NA; 2=1; 4=2'))



##############################################################################
# ADD CASES FROM THE 1970 ANES SUPPLEMENT  
##############################################################################
# Respondents in the 1970 ANES supplement are not included in the cumulative  
# ANES.
source('ANES_1970_supplement.R')
ANES1970$stateYoung <- droplevels(ANES1970$stateYoung)
reqd.vars <- bind_rows(
  reqd.vars, 
  ANES1970[!ANES1970$ID.unique %in% reqd.vars$respondentID, colnames(ANES1970) %in% colnames(reqd.vars)] )
reqd.vars <- reqd.vars %>%
  mutate(
    stateYoung = factor(stateYoung),
    state.contemp = factor(state.contemp))


# Adjust cand.vars. I haven't coded any candidate variables in the 1970 
# dataset, so this just amounts to appending NA rows.
tmp <- matrix(
  NA, 
  nrow     = nrow(reqd.vars) - nrow(cand.vars), 
  ncol     = ncol(cand.vars),
  dimnames = list(NULL, colnames(cand.vars)))
cand.vars <- bind_rows(cand.vars, data.frame(tmp))  



##############################################################################
# CHOOSE CANDIDATE VARIABLES AND RUN AMELIA
##############################################################################
guarantee.imput.df <- makePredictorMatrix(
  depvarName = 'guarantee.7pt',   
  reqd.vars  = reqd.vars, 
  cand.vars  = cand.vars,
  ID         = "ID.unique")
health.imput.df <- makePredictorMatrix(
  depvarName = 'govt.health.7pt', 
  reqd.vars  = reqd.vars, 
  cand.vars  = cand.vars,
  ID         = "ID.unique")

# RESCALE OUTCOMES FROM 0 TO 1  
# This rescaling should be done before the imputation.  But it should also be
# done after the "0" responses have been eliminated from the data frame -- 
# that is, after makePredictorMatrix() has been run.
guarantee.imput.df$guarantee.7pt <- Bullock::rescale(guarantee.imput.df$guarantee.7pt)
health.imput.df$govt.health.7pt  <- Bullock::rescale(health.imput.df$govt.health.7pt)


# SET UP MULTIPLE IMPUTATION FUNCTION
# Imputation is slow, and by default, it runs only one one core.  The extra 
# code here makes use of multiple cores.  For more information, see
# https://lists.gking.harvard.edu/pipermail/amelia/2012-July/000896.html.  
imputeMultiCore <- function (x, m, ...) {
  foreach (
    i             = 1:m,
    .combine      = "ameliabind",
    .multicombine = TRUE,
    .packages     = "Amelia",
    .inorder      = FALSE
  ) %dopar% {
    amelia(x, m = 1, ...)
  } 
}


# SET SEED
set.seed(1977)


# REGISTER MULTIPLE CORES
if (Sys.info()['sysname'] == 'Windows') { 
  cl <- makeCluster(4)
  registerDoParallel(cl)
} 


# RUN AMELIA FOR EACH ANES DEPENDENT VARIABLE
cat("Imputing missing data for variables related to the ANES health-care question...\n")
health.out <- imputeMultiCore(
  x        = health.imput.df, 
  m        = 10,  # number of imputed datasets to create 
  ts       = 'yearInt', 
  idvars   = qw("respondentID yearYoung stateYoung state.contemp 
    region.contemp stateOfBirth"),
  ords     = c("educ5Level", "incomeFam"),
  noms     = which(
    lapply(health.imput.df, class) == 'factor' & 
      !names(health.imput.df) %in% qw('educ5Level incomeFam state.contemp region.contemp stateOfBirth stateYoung')),
  polytime = 1, 
  empri    = .01) 

cat("Imputing missing data for variables related to the ANES guaranteed-standard-of-living question...\n")
guarantee.out <- imputeMultiCore(
  x        = guarantee.imput.df, 
  m        = 10, 
  ts       = 'yearInt', 
  idvars   = qw("respondentID yearYoung stateYoung state.contemp 
      region.contemp stateOfBirth"),
  ords     = c("educ5Level", "incomeFam"),
  noms     = which(
    lapply(guarantee.imput.df, class) == 'factor' & 
      !names(guarantee.imput.df) %in% qw('educ5Level incomeFam state.contemp region.contemp stateOfBirth stateYoung')),
  polytime = 1, 
  empri    = .02) 


save(health.out,    file = paste0(dirOutput, 'ANESImputedDatasetHealth.RData'))
save(guarantee.out, file = paste0(dirOutput, 'ANESImputedDatasetGuarantee.RData'))



####################################################################
# STOP THE CLUSTER AND QUIT
####################################################################
if (Sys.info()['sysname'] == 'Windows') stopCluster(cl)
